First, let’s load the necessary libraries and data that will allow us to begin our investigation!
## Libraries to include
library(tidyverse)
library(lubridate)
## Load the data
# Replace the path below with the path to where your data lives
data_path <- "https://datajournalism.tech/wp-content/uploads/2019/10/wichita.csv"
stops <- read_csv(data_path)
# Additional data and fixed values we'll be using
population_2016 <- tibble(subject_race = c("asian/pacific islander", "black", "hispanic", "other/unknown","white"),
num_people = c(19294, 42485, 65090, 16686, 245499)) %>%
mutate(subject_race = as.factor(subject_race))
center_lat <- 37.689820
center_lng <- -97.336454
colnames(stops)
## [1] "X1" "raw_row_number"
## [3] "date" "time"
## [5] "location" "lat"
## [7] "lng" "subject_age"
## [9] "subject_race" "subject_sex"
## [11] "type" "disposition"
## [13] "violation" "citation_issued"
## [15] "outcome" "posted_speed"
## [17] "vehicle_color" "vehicle_make"
## [19] "vehicle_model" "vehicle_year"
## [21] "raw_defendant_race" "raw_defendant_ethnicity"
nrow(stops)
## [1] 57750
str(stops)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 57750 obs. of 22 variables:
## $ X1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ raw_row_number : chr "923578" "923657" "912091" "923680" ...
## $ date : Date, format: "2016-01-01" "2016-01-01" ...
## $ time : 'hms' num 18:00:00 18:08:00 18:11:00 18:13:00 ...
## ..- attr(*, "units")= chr "secs"
## $ location : chr "N WEST ST, KS, 67205" "8000 W 13TH ST N, WICHITA, KS, 67212" "500 S LIMUEL ST, WICHITA, KS, 67235" "7600 W 21ST ST N, WICHITA, KS, 67205" ...
## $ lat : num 37.7 37.7 37.7 37.7 37.7 ...
## $ lng : num -97.4 -97.4 -97.5 -97.4 -97.4 ...
## $ subject_age : num 16 44 20 21 28 27 15 20 23 NA ...
## $ subject_race : chr "white" "white" "white" "hispanic" ...
## $ subject_sex : chr "female" "male" "male" "female" ...
## $ type : chr "vehicular" "vehicular" "vehicular" "vehicular" ...
## $ disposition : chr "DISMISSED" "GUILTY (IVR)" "DISMISSED WITH PREJUDICE; DISMISSED WITH PREJUDICE" "GUILTY" ...
## $ violation : chr "RUN STOP SIGN" "SPEED OVER LIMIT" "DUI; INATTENTIVE DRIVING" "SPEED OVER LIMIT" ...
## $ citation_issued : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ outcome : chr "citation" "citation" "citation" "citation" ...
## $ posted_speed : num NA 40 NA 40 40 40 NA NA NA NA ...
## $ vehicle_color : chr "BURGUNDY OR MAROON" "\"ALUMINUM, SILVER\"" "WHITE" "\"ALUMINUM, SILVER\"" ...
## $ vehicle_make : chr "JEEP (1989 TO PRESENT)" "HYUNDAI" "HONDA" "TOYOTA" ...
## $ vehicle_model : chr NA "TUCSON" NA NA ...
## $ vehicle_year : num 2008 NA NA NA NA ...
## $ raw_defendant_race : chr "W" "W" "W" "W" ...
## $ raw_defendant_ethnicity: chr "N" "N" "N" "H" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. raw_row_number = col_character(),
## .. date = col_date(format = ""),
## .. time = col_time(format = ""),
## .. location = col_character(),
## .. lat = col_double(),
## .. lng = col_double(),
## .. subject_age = col_double(),
## .. subject_race = col_character(),
## .. subject_sex = col_character(),
## .. type = col_character(),
## .. disposition = col_character(),
## .. violation = col_character(),
## .. citation_issued = col_logical(),
## .. outcome = col_character(),
## .. posted_speed = col_double(),
## .. vehicle_color = col_character(),
## .. vehicle_make = col_character(),
## .. vehicle_model = col_character(),
## .. vehicle_year = col_double(),
## .. raw_defendant_race = col_character(),
## .. raw_defendant_ethnicity = col_character()
## .. )
summary(stops)
## X1 raw_row_number date time
## Min. : 1 Length:57750 Min. :2016-01-01 Length:57750
## 1st Qu.:14438 Class :character 1st Qu.:2016-03-16 Class1:hms
## Median :28876 Mode :character Median :2016-05-29 Class2:difftime
## Mean :28876 Mean :2016-06-10 Mode :numeric
## 3rd Qu.:43313 3rd Qu.:2016-08-31
## Max. :57750 Max. :2016-12-31
##
## location lat lng subject_age
## Length:57750 Min. :37.47 Min. :-101.36 Min. :11.00
## Class :character 1st Qu.:37.67 1st Qu.: -97.37 1st Qu.:24.00
## Mode :character Median :37.69 Median : -97.34 Median :33.00
## Mean :37.69 Mean : -97.33 Mean :36.71
## 3rd Qu.:37.70 3rd Qu.: -97.28 3rd Qu.:48.00
## Max. :38.48 Max. : -96.75 Max. :99.00
## NA's :1167 NA's :1167 NA's :10128
## subject_race subject_sex type
## Length:57750 Length:57750 Length:57750
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## disposition violation citation_issued outcome
## Length:57750 Length:57750 Mode:logical Length:57750
## Class :character Class :character TRUE:57750 Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## posted_speed vehicle_color vehicle_make vehicle_model
## Min. : 20.00 Length:57750 Length:57750 Length:57750
## 1st Qu.: 30.00 Class :character Class :character Class :character
## Median : 40.00 Mode :character Mode :character Mode :character
## Mean : 39.93
## 3rd Qu.: 40.00
## Max. :304.00
## NA's :35149
## vehicle_year raw_defendant_race raw_defendant_ethnicity
## Min. :1962 Length:57750 Length:57750
## 1st Qu.:2001 Class :character Class :character
## Median :2005 Mode :character Mode :character
## Mean :2005
## 3rd Qu.:2009
## Max. :2999
## NA's :43236
# This method uses the group_by/summarize paradigm
stops %>%
group_by(subject_race) %>%
summarize(
n = n(),
prop = n / nrow(.)
)
## # A tibble: 5 x 3
## subject_race n prop
## <chr> <int> <dbl>
## 1 asian/pacific islander 1607 0.0278
## 2 black 8038 0.139
## 3 hispanic 6709 0.116
## 4 other/unknown 9335 0.162
## 5 white 32061 0.555
``
We saw before that over two-thirds of stops were of black drivers. The by-race stop counts are only meaningful, though, when compared to some baseline. If the Philadelphia population was about two-thirds black, then two-thirds of stops being of black drivers wouldn’t be at all surprising.
In order to do this baseline comparison, we need to understand the racial demographics in our Philly population data. The data as we’ve given it to you has raw population numbers. To make it useful, we’ll need to compute the proportion of Philadelphia residents in each demographic group. (Hint: use the mutate() function.)
population_2016 %>%
mutate(prop = num_people / sum(num_people))
## # A tibble: 5 x 3
## subject_race num_people prop
## <fct> <dbl> <dbl>
## 1 asian/pacific islander 19294 0.0496
## 2 black 42485 0.109
## 3 hispanic 65090 0.167
## 4 other/unknown 16686 0.0429
## 5 white 245499 0.631
stop_final <- stops %>%
count(subject_race) %>%
left_join(
population_2016,
by = "subject_race"
) %>%
mutate(stop_rate = n / num_people)
## Warning: Column `subject_race` joining character vector and factor,
## coercing into character vector
bar <- ggplot(stop_final,
aes(x=reorder(subject_race,stop_rate), y=stop_rate))+
geom_bar(stat="identity",
position="identity",
fill="red")+
geom_hline(yintercept = 0) +
labs(title="Stopped Drivers by Race",
subtitle = "African American drivers were stopped more than White people in the city of Wichita,Kansas")+
coord_flip()
options(scipen=10000)
bar
library(leaflet)
library(httpuv)
#Step 1. Create a color palette of your choice.
race <- colorFactor(c("beige", "black", "orange", "darkgreen", "blue"),
domain=c("white", "black", "asian/pacific islander", "hispanic", "other/unknown"),
ordered = TRUE)
#Step 3. Drop missing data values.
f <- stops %>% drop_na(lat, lng)
#Step 4. Map the data set.
leaflet(f) %>%
addProviderTiles(providers$CartoDB) %>%
setView(lng= -97.336454, lat= 37.689820, zoom=16) %>%
addCircleMarkers(~lng,
~lat,
popup=paste("This is a/an", f$subject_race, "and", f$subject_sex, "driver."),
weight= 3,
radius=4,
color=~race(subject_race),
stroke=F,
fillOpacity = 1)